static const char* clKernelsString = \
	"//////////////////////////////////////////////////////////////////////////////////////\n"
	"// Private constants\n"
	"typedef uint2 KeyValuePair;\n"
	"\n"
	"\n"
	"\n"
	"//#pragma OPENCL EXTENSION cl_khr_local_int32_base_atomics : enable\n"
	"\n"
	"\n"
	"// May want to factor BITS_PER_PASS out\n"
	"#define BITS_PER_PASS 4\n"
	"\n"
	"#define GROUP_SIZE 128\n"
	"#define ELEMENTS_PER_WORK_ITEM 4\n"
	"#define SCAN_WORKGROUP_SIZE 128\n"
	"#define BLOCK_SUM_WORKGROUP_SIZE 128\n"
	"#define NUM_BLOCK_SUM_ELEMENTS_PER_ITEM ((GRID_SIZE+255)/BLOCK_SUM_WORKGROUP_SIZE)\n"
	"\n"
	"\n"
	"#define LOG_NUM_BANKS 5\n"
	"#define NUM_LOCAL_BANKS (1<<LOG_NUM_BANKS)\n"
	"\n"
	"\n"
	"// Remove the top one and replace with the others, then can put the tuned versions back in\n"
	"#if 1\n"
	"#define CONVERT_CONFLICT_FREE(in) (in)\n"
	"#define CONVERT_CONFLICT_FREE_2VEC(in) (in)\n"
	"#define CONVERT_CONFLICT_FREE_4VEC(in) (in)\n"
	"#else\n"
	"#define CONVERT_CONFLICT_FREE(in) (in + (in>>LOG_NUM_BANKS))\n"
	"#define CONVERT_CONFLICT_FREE_2VEC(in) (intermediate.x = in.x>>LOG_NUM_BANKS, intermediate.y = in.y>>LOG_NUM_BANKS, in + intermediate)\n"
	"#define CONVERT_CONFLICT_FREE_4VEC(in) (intermediate.x = in.x>>LOG_NUM_BANKS, intermediate.y = in.y>>LOG_NUM_BANKS, intermediate.z = in.z>>LOG_NUM_BANKS, intermediate.w = in.w>>LOG_NUM_BANKS, in + intermediate)\n"
	"#endif\n"
	"\n"
	"#define BANK_CONFLICT_RESOLUTION_PADDING (CONVERT_CONFLICT_FREE(GROUP_SIZE*ELEMENTS_PER_WORK_ITEM)-GROUP_SIZE*ELEMENTS_PER_WORK_ITEM)\n"
	"\n"
	"\n"
	"\n"
	"\n"
	"////////////////////////////////////////////////////////////////////////////////////////\n"
	"// Prefixsum functions\n"
	"\n"
	"/**\n"
	" * Perform work efficient prefix sum in local memory in place.\n"
	" */\n"
	"void localPrefixSum(  __local unsigned *prefixSums, unsigned numElements )\n"
	"{\n"
	"    // Iterate over halving sizes of the element set performing reduction phase of scan\n"
	"\n"
	"    int offset = 1;\n"
	"    //for( int level = get_local_size(0); level > 0; level >>= 1 )\n"
	"    for( int level = numElements>>1; level > 0; level >>= 1 )\n"
	"    //for( int level = 1; level > 0; level >>= 1 )\n"
	"    {\n"
	"        barrier(CLK_LOCAL_MEM_FENCE);\n"
	"\n"
	"        for( int sumElement = get_local_id(0); sumElement < level; sumElement += get_local_size(0) )\n"
	"        {\n"
	"            //int ai = offset*(2*sumElement+1)-1;\n"
	"            //int bi = offset*(2*sumElement+2)-1;\n"
	"\n"
	"            int i = 2*offset*sumElement;\n"
	"            int ai = i + offset - 1;\n"
	"            int bi = ai + offset;\n"
	"\n"
	"            ai = CONVERT_CONFLICT_FREE(ai);\n"
	"            bi = CONVERT_CONFLICT_FREE(bi);\n"
	"            prefixSums[bi] += prefixSums[ai];\n"
	"        }\n"
	"        offset <<= 1;\n"
	"    }\n"
	"\n"
	"    barrier(CLK_LOCAL_MEM_FENCE);\n"
	"\n"
	"    // Need to clear the last element\n"
	"    if( get_local_id(0) == 0 )\n"
	"        prefixSums[ CONVERT_CONFLICT_FREE(numElements-1) ] = 0;\n"
	"\n"
	"    for( int level = 1; level < numElements; level <<= 1 )\n"
	"    {\n"
	"        offset >>= 1;\n"
	"        barrier(CLK_LOCAL_MEM_FENCE);\n"
	"\n"
	"        for( int sumElement = get_local_id(0); sumElement < level; sumElement += get_local_size(0) )\n"
	"        {\n"
	"            int ai = offset*(2*sumElement+1)-1;\n"
	"            int bi = offset*(2*sumElement+2)-1;\n"
	"\n"
	"            ai = CONVERT_CONFLICT_FREE(ai);\n"
	"            bi = CONVERT_CONFLICT_FREE(bi);\n"
	"            unsigned temporary = prefixSums[ai];\n"
	"            prefixSums[ai] = prefixSums[bi];\n"
	"            prefixSums[bi] += temporary;\n"
	"        }\n"
	"    }\n"
	"}\n"
	"\n"
	"// Barrier free because we know that this never gets wider than a WF\n"
	"void BClocalPF(  __local unsigned *prefixSums )\n"
	"{\n"
	"    // Iterate over halving sizes of the element set performing reduction phase of scan\n"
	"    int offset = 1;\n"
	"\n"
	"    for( int level = 8; level > 0; level >>= 1 )\n"
	"    {\n"
	"        for( int sumElement = get_local_id(0); sumElement < level; sumElement += get_local_size(0) )\n"
	"        {\n"
	"            int ai = offset*(2*sumElement+1)-1;\n"
	"            int bi = offset*(2*sumElement+2)-1;\n"
	"            prefixSums[bi] += prefixSums[ai];\n"
	"        }\n"
	"        offset <<= 1;\n"
	"    }\n"
	"\n"
	"\n"
	"    // Need to clear the last element\n"
	"    if( get_local_id(0) == 0 )\n"
	"        prefixSums[ 15 ] = 0;\n"
	"\n"
	"    for( int level = 1; level < 16; level <<= 1 )\n"
	"    {\n"
	"        offset >>= 1;\n"
	"\n"
	"        for( int sumElement = get_local_id(0); sumElement < level; sumElement += get_local_size(0) )\n"
	"        {\n"
	"            int ai = offset*(2*sumElement+1)-1;\n"
	"            int bi = offset*(2*sumElement+2)-1;\n"
	"            unsigned temporary = prefixSums[ai];\n"
	"            prefixSums[ai] = prefixSums[bi];\n"
	"            prefixSums[bi] += temporary;\n"
	"        }\n"
	"    }\n"
	"}\n"
	"\n"
	"\n"
	"\n"
	"/**\n"
	" * Optimised prefix sum that takes 4 elements per WI and assumes 128 WIs in\n"
	" * a group.\n"
	" * Also depends on WF size of 64 elements. Will need minor adaptation to scale down.\n"
	" */\n"
	"uint4 localPrefixSumBlock( uint4 prefixSumData, __local unsigned *prefixSums )\n"
	"{\n"
	"    uint4 originalData = prefixSumData;\n"
	"\n"
	"    // Do sum across vector in two stages\n"
	"    prefixSumData.y += prefixSumData.x;\n"
	"    prefixSumData.w += prefixSumData.z;\n"
	"\n"
	"    prefixSumData.z += prefixSumData.y;\n"
	"    prefixSumData.w += prefixSumData.y;\n"
	"\n"
	"    // Now just 128 values, each sum of a block of 4\n"
	"    prefixSums[get_local_id(0)] = 0;\n"
	"    prefixSums[get_local_id(0)+128] = prefixSumData.w;\n"
	"\n"
	"    // TODO: Could get rid of these barriers if get both WFs working but on different halves of data\n"
	"    barrier(CLK_LOCAL_MEM_FENCE);\n"
	"\n"
	"    // Do for single WF as we only have 128 values to deal with = 64*2\n"
	"/*\n"
	"	//	lh.todo. th. doesn\'t work on v2.2 because compiler mess this up. Update of prefixSums[idx] is only done at the end of this {}.\n"
	"	//	1. orig ver\n"
	"    if( get_local_id(0) < 64 )\n"
	"    {\n"
	"        int idx = 2*get_local_id(0) + 129;\n"
	"        prefixSums[idx] += prefixSums[idx-1];\n"
	"        prefixSums[idx] += prefixSums[idx-2];\n"
	"        prefixSums[idx] += prefixSums[idx-4];\n"
	"        prefixSums[idx] += prefixSums[idx-8];\n"
	"        prefixSums[idx] += prefixSums[idx-16];\n"
	"        prefixSums[idx] += prefixSums[idx-32];\n"
	"        prefixSums[idx] += prefixSums[idx-64];\n"
	"        // Propagate intermediate values through\n"
	"        prefixSums[idx-1] += prefixSums[idx-2];\n"
	"    }\n"
	"*/\n"
	"	//	2. using mem fence\n"
	"    if( get_local_id(0) < 64 )\n"
	"    {\n"
	"        int idx = 2*get_local_id(0) + 129;\n"
	"        prefixSums[idx] += prefixSums[idx-1];\n"
	"		mem_fence(CLK_LOCAL_MEM_FENCE);\n"
	"        prefixSums[idx] += prefixSums[idx-2];\n"
	"		mem_fence(CLK_LOCAL_MEM_FENCE);\n"
	"        prefixSums[idx] += prefixSums[idx-4];\n"
	"		mem_fence(CLK_LOCAL_MEM_FENCE);\n"
	"        prefixSums[idx] += prefixSums[idx-8];\n"
	"		mem_fence(CLK_LOCAL_MEM_FENCE);\n"
	"        prefixSums[idx] += prefixSums[idx-16];\n"
	"		mem_fence(CLK_LOCAL_MEM_FENCE);\n"
	"        prefixSums[idx] += prefixSums[idx-32];\n"
	"		mem_fence(CLK_LOCAL_MEM_FENCE);\n"
	"        prefixSums[idx] += prefixSums[idx-64];\n"
	"		mem_fence(CLK_LOCAL_MEM_FENCE);\n"
	"        // Propagate intermediate values through\n"
	"        prefixSums[idx-1] += prefixSums[idx-2];\n"
	"    }\n"
	"/*\n"
	"	//	this should work on v2.2 too. Checked assembly but doesnt run as I expect.\n"
	"	//	3. using volatile (doesn\'t work somehow)\n"
	"	volatile __local unsigned* ps = prefixSums;\n"
	"    if( get_local_id(0) < 64 )\n"
	"    {\n"
	"        int idx = 2*get_local_id(0) + 129;\n"
	"        ps[idx] += ps[idx-1];\n"
	"        ps[idx] += ps[idx-2];\n"
	"        ps[idx] += ps[idx-4];\n"
	"        ps[idx] += ps[idx-8];\n"
	"        ps[idx] += ps[idx-16];\n"
	"        ps[idx] += ps[idx-32];\n"
	"        ps[idx] += ps[idx-64];\n"
	"        // Propagate intermediate values through\n"
	"        ps[idx-1] += ps[idx-2];\n"
	"    }\n"
	"*/\n"
	"/*\n"
	"	//	4. using barrier\n"
	"    {\n"
	"        int idx = 2*get_local_id(0) + 129;\n"
	"	    if( get_local_id(0) < 64 )\n"
	"		{\n"
	"	        prefixSums[idx] += prefixSums[idx-1];\n"
	"		}\n"
	"		barrier(CLK_LOCAL_MEM_FENCE);\n"
	"	    if( get_local_id(0) < 64 )\n"
	"		{\n"
	"	        prefixSums[idx] += prefixSums[idx-2];\n"
	"		}\n"
	"		barrier(CLK_LOCAL_MEM_FENCE);\n"
	"	    if( get_local_id(0) < 64 )\n"
	"		{\n"
	"	        prefixSums[idx] += prefixSums[idx-4];\n"
	"		}\n"
	"		barrier(CLK_LOCAL_MEM_FENCE);\n"
	"	    if( get_local_id(0) < 64 )\n"
	"		{\n"
	"	        prefixSums[idx] += prefixSums[idx-8];\n"
	"		}\n"
	"		barrier(CLK_LOCAL_MEM_FENCE);\n"
	"	    if( get_local_id(0) < 64 )\n"
	"		{\n"
	"	        prefixSums[idx] += prefixSums[idx-16];\n"
	"		}\n"
	"		barrier(CLK_LOCAL_MEM_FENCE);\n"
	"	    if( get_local_id(0) < 64 )\n"
	"		{\n"
	"	        prefixSums[idx] += prefixSums[idx-32];\n"
	"		}\n"
	"		barrier(CLK_LOCAL_MEM_FENCE);\n"
	"	    if( get_local_id(0) < 64 )\n"
	"		{\n"
	"	        prefixSums[idx] += prefixSums[idx-64];\n"
	"        // Propagate intermediate values through\n"
	"		}\n"
	"		barrier(CLK_LOCAL_MEM_FENCE);\n"
	"	    if( get_local_id(0) < 64 )\n"
	"		{\n"
	"	        prefixSums[idx-1] += prefixSums[idx-2];\n"
	"		}\n"
	"    }\n"
	"*/\n"
	"\n"
	"    barrier(CLK_LOCAL_MEM_FENCE);\n"
	"\n"
	"    // Grab and propagate for whole WG - loading the - 1 value\n"
	"    uint addValue = prefixSums[get_local_id(0)+127];\n"
	"\n"
	"    // Propagate item level sum across vector\n"
	"    prefixSumData += (uint4)(addValue, addValue, addValue, addValue);\n"
	"\n"
	"    // And return the final value which is the final sum\n"
	"    return prefixSumData - originalData;\n"
	"}\n"
	"\n"
	"\n"
	"\n"
	"\n"
	"\n"
	"\n"
	"\n"
	"\n"
	"\n"
	"\n"
	"\n"
	"\n"
	"////////////////////////////////////////////////////////////////////////////////////////\n"
	"// Public functions\n"
	"\n"
	"\n"
	"\n"
	"/**\n"
	" * Generate a local histogram from the sorted data counting how many entries of each set of 4 local digits we have.\n"
	" * This version just uses atomics to do summation because the number should be relatively small.\n"
	" *\n"
	" * alternative: Histogram has 32(local mem banks)*2^numBits entries which are reduced into the first 2^numBits elements in this function.\n"
	" * The rest of the histogram array is temporary data.\n"
	" * There are 32 histogram bins for each value to avoid the need for atomics.\n"
	" *\n"
	" * @param histogramOutputRadixMajor is ordered by radices - inefficient read and write but easily prefix summed.\n"
	" * This will be the global set of offsets.\n"
	" * @param histogramOutputGroupMajor is ordered by groups. This is the local set of offsets for the radix sort scatter to use\n"
	" * later that will not be globally summed.\n"
	" */\n"
	"void generateHistogram(\n"
	"		uint4 sortedData,\n"
	"		__local unsigned *histogram,\n"
	"		__global unsigned *histogramOutputRadixMajor,\n"
	"		__global unsigned *histogramOutputGroupMajor,\n"
	"		unsigned startBit,\n"
	"        unsigned numGroups)\n"
	"{\n"
	"    uint4 addresses;\n"
	"    addresses = (uint4)(get_local_id(0), get_local_id(0), get_local_id(0), get_local_id(0));\n"
	"    // This bit is best unvectorised as we can\'t write more than 2 values to local per WI anyway\n"
	"    if( get_local_id(0) < (1<<BITS_PER_PASS) )\n"
	"    {\n"
	"    	histogram[addresses.x] = 0;\n"
	"    }\n"
	"\n"
	"    addresses = addresses * (unsigned)4;\n"
	"    addresses.y = addresses.y+1;\n"
	"    addresses.z = addresses.z+2;\n"
	"    addresses.w = addresses.w+3;\n"
	"\n"
	"	// Obtain correct histogram address using only the BITS_PER_PASS bits that we\'re sorting by in this pass\n"
	"	sortedData.x >>= startBit;\n"
	"	sortedData.y >>= startBit;\n"
	"	sortedData.z >>= startBit;\n"
	"	sortedData.w >>= startBit;\n"
	"\n"
	"	int andValue = ((1<<BITS_PER_PASS)-1);\n"
	"	sortedData &= (uint4)(andValue, andValue, andValue, andValue);\n"
	"\n"
	"	// Alternative approach counting differences\n"
	"	// Doesn\'t work yet, and will probably never be faster anyway due to ifs\n"
	"	// only makes sense if atomics are *very* slow\n"
	"#if 0\n"
	"	histogram[16+get_local_id(0)] = sortedData.w;\n"
	"#endif\n"
	"\n"
	"    // Perform the atomic histogram updates\n"
	"	barrier(CLK_LOCAL_MEM_FENCE);\n"
	"\n"
	"	// Alternative approach counting differences\n"
	"	// Doesn\'t work yet, and will probably never be faster anyway due to ifs\n"
	"	// only makes sense if atomics are *very* slow\n"
	"#if 0\n"
	"	uint previousValue = histogram[16-1+get_local_id(0)];\n"
	"	if(sortedData.x != previousValue && get_local_id(0) > 0)\n"
	"		histogram[previousValue] = get_local_id(0)*4;\n"
	"	if(sortedData.z != sortedData.y)\n"
	"		histogram[sortedData.y] = get_local_id(0)*4;\n"
	"	if(sortedData.w != sortedData.z)\n"
	"		histogram[sortedData.z] = get_local_id(0)*4;\n"
	"#else\n"
	"	atomic_inc( &(histogram[sortedData.x]) );\n"
	"	atomic_inc( &(histogram[sortedData.y]) );\n"
	"	atomic_inc( &(histogram[sortedData.z]) );\n"
	"	atomic_inc( &(histogram[sortedData.w]) );\n"
	"#endif\n"
	"\n"
	"	barrier(CLK_LOCAL_MEM_FENCE);\n"
	"\n"
	"\n"
	"    // No need to vectorise this with only 16 values to process\n"
	"    if( get_local_id(0) < 16 )\n"
	"    {\n"
	"     	uint histValues;\n"
	"\n"
	"     	histValues = histogram[get_local_id(0)];\n"
	"\n"
	"     	unsigned globalOffset = 16*get_group_id(0);\n"
	"     	uint globalAddresses = get_local_id(0) + globalOffset;\n"
	"\n"
	"     	uint globalAddressesRadixMajor = numGroups;\n"
	"     	globalAddressesRadixMajor = globalAddressesRadixMajor * get_local_id(0);\n"
	"     	globalAddressesRadixMajor = globalAddressesRadixMajor + get_group_id(0);\n"
	"\n"
	"\n"
	"     	histogramOutputGroupMajor[globalAddresses] = histValues;\n"
	"     	histogramOutputRadixMajor[globalAddressesRadixMajor] = histValues;\n"
	"     }\n"
	"}\n"
	"\n"
	"\n"
	"\n"
	"#define LOCAL_SORT_WG_SIZE 128\n"
	"\n"
	"/*\n"
	" * Perform radix sort operation on a local array of unsigned ints\n"
	" * based on a particular set of bit entries of BITS_PER_PASS in size at\n"
	" * startbit in the data.\n"
	" * Outputs the data and a histogram representing the counts of all\n"
	" * BITS_PER_PASS sized values within the local memory array.\n"
	" * histogramOutput is stored as all the 0 bins, then all the 1 bins\n"
	" * and so on. Inefficient write out in here - simple prefix sum.\n"
	" */\n"
	"__attribute__((reqd_work_group_size(LOCAL_SORT_WG_SIZE,1,1)))\n"
	"__kernel void RadixSortLocal(\n"
	"        // uint8 - this is really 4 KeyValuePairs packed together\n"
	"		// There is probably a cleaner way to represent this\n"
	"		__global uint8 *dataToSort,\n"
	"\n"
	"		__global unsigned *histogramOutputGlobalRadixMajor,\n"
	"		__global unsigned *histogramOutputLocalGroupMajor,\n"
	"		int startBit,\n"
	"        int numGroups,\n"
	"		__local unsigned *sorterSharedMemory\n"
	"        )\n"
	"{\n"
	"\n"
	"	int numLocalElements = get_local_size(0)*ELEMENTS_PER_WORK_ITEM;\n"
	"\n"
	"    uint4 plainLocalAddress = (uint4)(get_local_id(0), get_local_id(0), get_local_id(0), get_local_id(0));\n"
	"    uint4 addValues = (uint4)(0,1,2,3);\n"
	"    plainLocalAddress = plainLocalAddress * (unsigned)4;\n"
	"    plainLocalAddress = plainLocalAddress + addValues;\n"
	"\n"
	"    // localAddress that\'s been converted to avoid conflicts\n"
	"    uint4 localAddress = CONVERT_CONFLICT_FREE_4VEC(plainLocalAddress);\n"
	"\n"
	"    uint4 localKeys;\n"
	"    uint4 localValues;\n"
	"	{\n"
	"        uint4 globalAddress =  (uint4)(get_group_id(0), get_group_id(0), get_group_id(0), get_group_id(0));\n"
	"        uint4 localElementsCount = (uint4)(numLocalElements, numLocalElements, numLocalElements, numLocalElements);\n"
	"        globalAddress = globalAddress*localElementsCount+localAddress;\n"
	"\n"
	"        // uint8 - this is really 4 KeyValuePairs packed together\n"
	"        // Probably a cleaner way to represent this\n"
	"        uint8 localData;\n"
	"		localData = dataToSort[globalAddress.x/4];\n"
	"		localKeys.s0 = localData.s0;\n"
	"		localKeys.s1 = localData.s2;\n"
	"		localKeys.s2 = localData.s4;\n"
	"		localKeys.s3 = localData.s6;\n"
	"		localValues.s0 = localData.s1;\n"
	"		localValues.s1 = localData.s3;\n"
	"		localValues.s2 = localData.s5;\n"
	"		localValues.s3 = localData.s7;\n"
	"	}\n"
	"\n"
	"	// Iterate over the block of bits we are sorting internally in this kernel\n"
	"	//for( int bitIndex = startBit; bitIndex < (startBit+BITS_PER_PASS ); ++bitIndex )\n"
	"	int bitIndex = startBit;\n"
	"	do\n"
	"	{\n"
	"		// Write a local mem value just after the range to be processed by the prefix sum\n"
	"\n"
	"		// TODO: 512??\n"
	"        if( get_local_id(0) == (get_local_size(0)-1) )\n"
	"        	sorterSharedMemory[256] = localKeys.w;\n"
	"\n"
	"        unsigned compare = (1<<bitIndex);\n"
	"        uint4 compareVec = (uint4)(compare, compare, compare, compare);\n"
	"\n"
	"        uint4 localCompareVec = localKeys & compareVec;\n"
	"\n"
	"        uint4 prefixSum;\n"
	"        prefixSum = select( (uint4)(1,1,1,1), (uint4)(0,0,0,0), localCompareVec != (uint4)(0,0,0,0) );\n"
	"\n"
	"        prefixSum = localPrefixSumBlock(prefixSum, sorterSharedMemory);\n"
	"\n"
	"        // Need to get totalFalses from somewhere - that\'s the value in [255]. Could just assume 255 of sorterSharedMemory is ok, but\n"
	"        // that\'s hardly clean\n"
	"        uint totalFalses = sorterSharedMemory[255];\n"
	"\n"
	"		/////////////////\n"
	"		// Perform the local sort\n"
	"        // Rearrange data using local memory\n"
	"        {\n"
	"        	uint4 localCompareVec = localKeys & compareVec;\n"
	"\n"
	"        	uint4 newAddress = plainLocalAddress - prefixSum;\n"
	"        	newAddress += (uint4)(totalFalses, totalFalses, totalFalses, totalFalses);\n"
	"\n"
	"        	newAddress = select( prefixSum, newAddress, localCompareVec != (uint4)(0, 0, 0, 0) );\n"
	"\n"
	"        	newAddress = CONVERT_CONFLICT_FREE_4VEC(newAddress);\n"
	"\n"
	"        	// Internally sort keys and then values\n"
	"        	// May be more efficient to do this in one go using the pairs\n"
	"        	barrier(CLK_LOCAL_MEM_FENCE);\n"
	"\n"
	"        	sorterSharedMemory[newAddress.x] = localKeys.x;\n"
	"        	sorterSharedMemory[newAddress.y] = localKeys.y;\n"
	"        	sorterSharedMemory[newAddress.z] = localKeys.z;\n"
	"        	sorterSharedMemory[newAddress.w] = localKeys.w;\n"
	"\n"
	"        	barrier(CLK_LOCAL_MEM_FENCE);\n"
	"\n"
	"            // Back up sort values to give thread-local store of current state of local mem\n"
	"        	localKeys.x = sorterSharedMemory[localAddress.x];\n"
	"        	localKeys.y = sorterSharedMemory[localAddress.y];\n"
	"        	localKeys.z = sorterSharedMemory[localAddress.z];\n"
	"        	localKeys.w = sorterSharedMemory[localAddress.w];\n"
	"\n"
	"    		barrier(CLK_LOCAL_MEM_FENCE);\n"
	"\n"
	"            sorterSharedMemory[newAddress.x] = localValues.x;\n"
	"            sorterSharedMemory[newAddress.y] = localValues.y;\n"
	"            sorterSharedMemory[newAddress.z] = localValues.z;\n"
	"            sorterSharedMemory[newAddress.w] = localValues.w;\n"
	"\n"
	"            barrier(CLK_LOCAL_MEM_FENCE);\n"
	"\n"
	"            // Back up sort values to give thread-local store of current state of local mem\n"
	"            localValues.x = sorterSharedMemory[localAddress.x];\n"
	"            localValues.y = sorterSharedMemory[localAddress.y];\n"
	"            localValues.z = sorterSharedMemory[localAddress.z];\n"
	"            localValues.w = sorterSharedMemory[localAddress.w];\n"
	"\n"
	"            barrier(CLK_LOCAL_MEM_FENCE);\n"
	"        }\n"
	"\n"
	"		bitIndex = bitIndex + 1;\n"
	"	} while( bitIndex < (startBit+BITS_PER_PASS ) );\n"
	"\n"
	"	// Generate local histogram and output to global histogram storage array\n"
	"	generateHistogram( localKeys, sorterSharedMemory, histogramOutputGlobalRadixMajor, histogramOutputLocalGroupMajor, startBit, numGroups );\n"
	"\n"
	"	{\n"
	"        uint4 globalAddress =  (uint4)(get_group_id(0), get_group_id(0), get_group_id(0), get_group_id(0));\n"
	"        uint4 localElementsCount = (uint4)(numLocalElements, numLocalElements, numLocalElements, numLocalElements);\n"
	"        globalAddress = globalAddress*localElementsCount+plainLocalAddress;\n"
	"\n"
	"        uint8 localData;\n"
	"        localData.s0 = localKeys.s0;\n"
	"        localData.s2 = localKeys.s1;\n"
	"        localData.s4 = localKeys.s2;\n"
	"        localData.s6 = localKeys.s3;\n"
	"        localData.s1 = localValues.s0;\n"
	"        localData.s3 = localValues.s1;\n"
	"        localData.s5 = localValues.s2;\n"
	"        localData.s7 = localValues.s3;\n"
	"\n"
	"        // 2* because we have key/value pairs now\n"
	"		dataToSort[globalAddress.x/4] = localData;\n"
	"	}\n"
	"}\n"
	"\n"
	"\n"
	"\n"
	"\n"
	"/*\n"
	" * ScanLargeArrays : Scan is done for each block and the sum of each\n"
	" * block is stored in separate array (sumBuffer). SumBuffer is scanned\n"
	" * and results are added to every value of next corresponding block to\n"
	" * compute the scan of a large array.(not limited to 2*MAX_GROUP_SIZE)\n"
	" * Scan uses a balanced tree algorithm. See Belloch, 1990 \"Prefix Sums\n"
	" * and Their Applications\"\n"
	" * @param output output data\n"
	" * @param input  input data\n"
	" * @param block  local memory used in the kernel\n"
	" * @param length length of the input data\n"
	" * @param sumBuffer  sum of blocks\n"
	" */\n"
	"__attribute__((reqd_work_group_size(128,1,1)))\n"
	"__kernel\n"
	"void ScanLargeArrays(\n"
	"		__global unsigned int *output,\n"
	"        __global unsigned int *input,\n"
	"        __local  unsigned int *block,	 // Size : block_size\n"
	"        const uint block_size,	 // size of block\n"
	"        const uint length,	 	 // no of elements\n"
	"        __global unsigned int *sumBuffer)  // sum of blocks\n"
	"\n"
	"{\n"
	"	int tid = get_local_id(0);\n"
	"	int gid = get_global_id(0);\n"
	"	int bid = get_group_id(0);\n"
	"\n"
	"	int offset = 1;\n"
	"\n"
	"    /* Cache the computational window in shared memory */\n"
	"    if( (2*gid + 1) < length )\n"
	"    {\n"
	"	   block[2*tid]     = input[2*gid];\n"
	"	   block[2*tid + 1] = input[2*gid + 1];\n"
	"    }  else {\n"
	"       block[2*tid]     = 0;\n"
	"       block[2*tid + 1] = 0;\n"
	"    }\n"
	"\n"
	"    /* build the sum in place up the tree */\n"
	"	for(int d = block_size>>1; d > 0; d >>=1)\n"
	"	{\n"
	"		barrier(CLK_LOCAL_MEM_FENCE);\n"
	"\n"
	"		if(tid<d)\n"
	"		{\n"
	"			int ai = offset*(2*tid + 1) - 1;\n"
	"			int bi = offset*(2*tid + 2) - 1;\n"
	"\n"
	"			block[bi] += block[ai];\n"
	"		}\n"
	"		offset *= 2;\n"
	"	}\n"
	"\n"
	"	barrier(CLK_LOCAL_MEM_FENCE);\n"
	"\n"
	"\n"
	"    /* store the value in sum buffer before making it to 0 */\n"
	"	sumBuffer[bid] = block[block_size - 1];\n"
	"\n"
	"	barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE);\n"
	"\n"
	"    /* scan back down the tree */\n"
	"\n"
	"    /* clear the last element */\n"
	"	block[block_size - 1] = 0;\n"
	"\n"
	"    /* traverse down the tree building the scan in the place */\n"
	"	for(int d = 1; d < block_size ; d *= 2)\n"
	"	{\n"
	"		offset >>=1;\n"
	"		barrier(CLK_LOCAL_MEM_FENCE);\n"
	"\n"
	"		if(tid < d)\n"
	"		{\n"
	"			int ai = offset*(2*tid + 1) - 1;\n"
	"			int bi = offset*(2*tid + 2) - 1;\n"
	"\n"
	"			unsigned int t = block[ai];\n"
	"			block[ai] = block[bi];\n"
	"			block[bi] += t;\n"
	"		}\n"
	"	}\n"
	"\n"
	"	barrier(CLK_LOCAL_MEM_FENCE);\n"
	"\n"
	"    /*write the results back to global memory */\n"
	"\n"
	"	if( (2*gid + 1) < length )\n"
	"    {\n"
	"        output[2*gid]     = block[2*tid];\n"
	"        output[2*gid + 1] = block[2*tid + 1];\n"
	"    }\n"
	"\n"
	"}\n"
	"\n"
	"/**\n"
	"  * ScanPropagateBlockSums : Takes as input blocks of data,\n"
	" * each individually prefix summed, as well as a set of block\n"
	" * sums that summarise the entire set.\n"
	" * Propagates the block sums into the prefix sum blocks.\n"
	" * @param scanArray the set of prefix summed blocks\n"
	" * @param block_size the number of data elements dealt with\n"
	" * by each work group. Need not relate directly to the\n"
	" * number of work items in a group, but should align for efficiency.\n"
	" * @param length length of the input data\n"
	" * @param sumBuffer  block sum array\n"
	" */\n"
	"__attribute__((reqd_work_group_size(128,1,1)))\n"
	"__kernel\n"
	"void ScanPropagateBlockSums(\n"
	"		__global unsigned int *scanArray,\n"
	"        const uint block_size,	 // size of block\n"
	"        const uint length,	 	 // no of elements\n"
	"        __global unsigned int *sumBuffer)  // sum of blocks\n"
	"{\n"
	"	// Get the appropriate block sum for this group\n"
	"	unsigned int blockSum = sumBuffer[get_group_id(0)+1];\n"
	"\n"
	"	// For entire block, add the sum to it\n"
	"\n"
	"	int endValue = min((get_group_id(0)+2)*(block_size), length);\n"
	"	for(\n"
	"			int i = (get_group_id(0)+1)*block_size + get_local_id(0);\n"
	"			i < endValue;\n"
	"			i += get_local_size(0))\n"
	"	{\n"
	"		scanArray[i] += blockSum;\n"
	"	}\n"
	"}\n"
	"\n"
	"/**\n"
	" * Do a scan on the block sum values which can then be propagated back to the blocks.\n"
	" * Currently this does not deal correctly with non power-of-two sizes (I think\n"
	" * certainly some sizes of data set fail to sort correctly, problem could be elsewhere).\n"
	" */\n"
	"__attribute__((reqd_work_group_size(128,1,1)))\n"
	"__kernel\n"
	"void SumBlockSums(__global unsigned *blockSums, unsigned numElements, __local unsigned *localMemory)\n"
	"{\n"
	"	int element;\n"
	"	for( element = get_local_id(0); element < numElements; element += get_local_size(0) )\n"
	"	{\n"
	"		localMemory[CONVERT_CONFLICT_FREE(element)] = blockSums[element];\n"
	"	}\n"
	"/*\n"
	"	for( ; element < 2048; element += get_local_size(0) )\n"
	"	{\n"
	"		localMemory[element + CONFLICT_FREE_OFFSET(element)] = 0;\n"
	"	}*/\n"
	"\n"
	"	barrier(CLK_LOCAL_MEM_FENCE);\n"
	"\n"
	"	//localPrefixSum(localMemory, 2048);\n"
	"	localPrefixSum(localMemory, numElements);\n"
	"\n"
	"	barrier(CLK_LOCAL_MEM_FENCE);\n"
	"\n"
	"	for( int element = get_local_id(0); element < (numElements); element += get_local_size(0) )\n"
	"	{\n"
	"		blockSums[element] = localMemory[CONVERT_CONFLICT_FREE(element)];\n"
	"	}\n"
	"}\n"
	"\n"
	"\n"
	"\n"
	"/**\n"
	" * RadixSortGlobal: Perform the global radix sort scatter from\n"
	" * pre-sorted local buffers and both local and global prefix-sum\n"
	" * offset information.\n"
	" * @param dataToSort contains the buffer of locally sorted blocks\n"
	" * @param histogramGlobalRadixMajor is radix major arranged\n"
	" * global prefix sum offset information for the bit block values.\n"
	" * @param histogramLocalGroupMajor contains per-group\n"
	" * local prefix sum information.\n"
	" * @param startBit is the first of BITS_PER_PASS bits in the key this pass is\n"
	" * sorting by.\n"
	" * @param sorterSharedMemory is the shared memory storage region,\n"
	" * divided up as necessary.\n"
	" */\n"
	"__attribute__((reqd_work_group_size(128,1,1)))\n"
	"__kernel\n"
	"void RadixSortGlobal(\n"
	"        // uint8 - this is really 4 KeyValuePairs packed together\n"
	"        // There is probably a cleaner way to represent this\n"
	"		__global uint8 *dataToSort,\n"
	"\n"
	"		__global unsigned *histogramGlobalRadixMajor,\n"
	"		__global unsigned *histogramLocalGroupMajor,\n"
	"		__global KeyValuePair *destinationArray,\n"
	"		unsigned startBit,\n"
	"		__local unsigned *sorterLocalMemory)\n"
	"{\n"
	"	// Move local histogram far enough through memory to allow for\n"
	"	// efficient local prefix sum\n"
	"	__local unsigned *localHistogram = sorterLocalMemory + 2*(1<<BITS_PER_PASS);\n"
	"	__local unsigned *globalHistogram = sorterLocalMemory;\n"
	"\n"
	"	// First load the local and global histogram data into local memory\n"
	"	// And do prefix sum of local histogram\n"
	"	// No point in vectorising this with only 16 bins\n"
	"	if( get_local_id(0) < ((1<<BITS_PER_PASS)/2) )\n"
	"	{\n"
	"		uint2 histElement = (uint2)(get_local_id(0), get_local_id(0)+8);\n"
	"\n"
	"		uint2 localValues;\n"
	"		globalHistogram[histElement.x] = histogramGlobalRadixMajor[get_num_groups(0)*histElement.x + get_group_id(0)];\n"
	"		globalHistogram[histElement.y] = histogramGlobalRadixMajor[get_num_groups(0)*histElement.y + get_group_id(0)];\n"
	"\n"
	"		localValues.x = histogramLocalGroupMajor[(1<<BITS_PER_PASS)*get_group_id(0) + histElement.x];\n"
	"		localValues.y = histogramLocalGroupMajor[(1<<BITS_PER_PASS)*get_group_id(0) + histElement.y];\n"
	"		localHistogram[histElement.x] = localValues.x;\n"
	"		localHistogram[histElement.y] = localValues.y;\n"
	"\n"
	"		localHistogram[histElement.x-(1<<BITS_PER_PASS)] = 0;\n"
	"		localHistogram[histElement.y-(1<<BITS_PER_PASS)] = 0;\n"
	"\n"
	"		int idx = 2*get_local_id(0);\n"
	"		localHistogram[idx] += localHistogram[idx-1];\n"
	"		mem_fence(CLK_LOCAL_MEM_FENCE);\n"
	"		localHistogram[idx] += localHistogram[idx-2];\n"
	"		mem_fence(CLK_LOCAL_MEM_FENCE);\n"
	"		localHistogram[idx] += localHistogram[idx-4];\n"
	"		mem_fence(CLK_LOCAL_MEM_FENCE);\n"
	"		localHistogram[idx] += localHistogram[idx-8];\n"
	"		mem_fence(CLK_LOCAL_MEM_FENCE);\n"
	"\n"
	"		// Propagate intermediate values through\n"
	"		localHistogram[idx-1] += localHistogram[idx-2];\n"
	"		mem_fence(CLK_LOCAL_MEM_FENCE);\n"
	"\n"
	"		// Grab and propagate for whole WG - loading the - 1 value\n"
	"		localValues.x = localHistogram[histElement.x-1];\n"
	"		localValues.y = localHistogram[histElement.y-1];\n"
	"\n"
	"		localHistogram[histElement.x] = localValues.x;\n"
	"		localHistogram[histElement.y] = localValues.y;\n"
	"\n"
	"	}\n"
	"\n"
	"\n"
	"	barrier(CLK_LOCAL_MEM_FENCE);\n"
	"\n"
	"    const int numLocalElements = 512;\n"
	"    uint4 localAddress = (uint4)(get_local_id(0), get_local_id(0), get_local_id(0), get_local_id(0));\n"
	"    localAddress = localAddress*(unsigned)4;\n"
	"    uint4 addValues = (uint4)(0,1,2,3);\n"
	"    localAddress = localAddress + addValues;\n"
	"    uint4 globalAddress = get_group_id(0)*numLocalElements + localAddress;\n"
	"\n"
	"    // uint8 - this is really 4 KeyValuePairs packed together\n"
	"    // There is probably a cleaner way to represent this\n"
	"	uint8 sortValue;\n"
	"	sortValue = dataToSort[globalAddress.x/4];\n"
	"\n"
	"	uint cmpValue = ((1<<BITS_PER_PASS)-1);\n"
	"	uint4 cmpValueVector = (uint4)(cmpValue, cmpValue, cmpValue, cmpValue);\n"
	"	uint4 radix;\n"
	"	// s0,2,4,6 are keys, 1,3,5,7 are values\n"
	"	radix.x = (sortValue.s0>>startBit);\n"
	"	radix.y = (sortValue.s2>>startBit);\n"
	"	radix.z = (sortValue.s4>>startBit);\n"
	"	radix.w = (sortValue.s6>>startBit);\n"
	"\n"
	"	radix = radix & cmpValueVector;\n"
	"\n"
	"	uint4 localOffsetIntoRadixSet;\n"
	"	localOffsetIntoRadixSet = localAddress;\n"
	"	localOffsetIntoRadixSet.x = localOffsetIntoRadixSet.x - localHistogram[radix.x];\n"
	"	localOffsetIntoRadixSet.y = localOffsetIntoRadixSet.y - localHistogram[radix.y];\n"
	"	localOffsetIntoRadixSet.z = localOffsetIntoRadixSet.z - localHistogram[radix.z];\n"
	"	localOffsetIntoRadixSet.w = localOffsetIntoRadixSet.w - localHistogram[radix.w];\n"
	"\n"
	"	uint4 globalOffset = localOffsetIntoRadixSet;\n"
	"	globalOffset.x = globalOffset.x + globalHistogram[radix.x];\n"
	"	globalOffset.y = globalOffset.y + globalHistogram[radix.y];\n"
	"	globalOffset.z = globalOffset.z + globalHistogram[radix.z];\n"
	"	globalOffset.w = globalOffset.w + globalHistogram[radix.w];\n"
	"\n"
	"	// s0,2,4,6 are keys, 1,3,5,7 are values\n"
	"	destinationArray[globalOffset.x] = (KeyValuePair)(sortValue.s0, sortValue.s1);\n"
	"	destinationArray[globalOffset.y] = (KeyValuePair)(sortValue.s2, sortValue.s3);\n"
	"	destinationArray[globalOffset.z] = (KeyValuePair)(sortValue.s4, sortValue.s5);\n"
	"	destinationArray[globalOffset.w] = (KeyValuePair)(sortValue.s6, sortValue.s7);\n"
	"}\n"
	"\n"
	"";